# Analysis of abstract-screening decisions from 12 reviewers (Aug 2017 round).
# Working directory is the lab-share mount holding the decision files.
# NOTE(review): setwd() ties this script to one machine's mount point —
# consider a project-relative path (e.g. here::here()); confirm before changing.
setwd("/Volumes/turnbaughlab/labshare/vaibhav_metastudy_Mar2017/pubmedsearch/")
knitr::opts_chunk$set(echo = TRUE, message=TRUE, warning=FALSE, tidy=FALSE, cache=FALSE)
library(readr)
library(ggplot2)
library(plotly)
library(stringr)
library(MicrobeR)
library(reshape2)
library(knitr)
library(plyr)
library(ape)

Data Import

# Number of reviewers; bump this when another reviewer's decision file lands.
NREV <- 12
reviewer_nums <- seq_len(NREV)

# One tibble per reviewer, read from "<n>_decisions.txt" and keyed by
# "Reviewer_<n>" so downstream lookups can index the list by name.
Reviews <- setNames(
  lapply(reviewer_nums, function(n) {
    suppressMessages(read_tsv(paste0("Aug24_2017_reviews/", n, "_decisions.txt")))
  }),
  paste0("Reviewer_", reviewer_nums)
)

Reviewer Stats

First generate some stats to plot out.

# Per-reviewer summary statistics: acceptance rate plus timing metrics
# derived from the gaps between consecutive decision timestamps.

# The 4th whitespace-separated field of TimeStamp is the clock time
# (e.g. "HH:MM:SS"); parse_time() turns it into an hms value (seconds).
# Extracted once here instead of repeating the same parse in every metric.
extract_times <- function(x) {
  parse_time(str_split(x$TimeStamp, " ", simplify = TRUE)[, 4])
}

# Seconds between consecutive decisions for one reviewer.  hms is stored
# in seconds, so as.numeric(diff(...)) is already in seconds.  (The
# original passed units = "seconds" to diff(), which diff.difftime
# silently ignores via `...` — the value was correct only because hms
# happens to be second-based.)
decision_gaps <- function(x) {
  as.numeric(diff(extract_times(x)))
}

ReviewerStats <- data.frame(ReviewerID = names(Reviews), stringsAsFactors = FALSE)

# Percent of abstracts marked "Include".
ReviewerStats$AcceptanceRate <- sapply(
  Reviews,
  function(x) 100 * sum(x$Decision == "Include") / length(x$Decision)
)

# Wall-clock span from first to last decision, in minutes.
ReviewerStats$TotalTime <- sapply(Reviews, function(x) {
  times <- extract_times(x)
  difftime(times[length(times)], times[1], units = "mins")
})

# Central tendency and extremes of per-abstract time, in seconds.
ReviewerStats$AverageTime <- sapply(Reviews, function(x) mean(decision_gaps(x)))
ReviewerStats$MedianTime <- sapply(Reviews, function(x) median(decision_gaps(x)))
ReviewerStats$MinTime <- sapply(Reviews, function(x) min(decision_gaps(x)))
ReviewerStats$MaxTime <- sapply(Reviews, function(x) max(decision_gaps(x)))

Most accepting and least accepting

# Sort reviewers from most to least accepting; the plot uses this order.
ReviewerStats <- ReviewerStats[order(-ReviewerStats$AcceptanceRate), ]

# One bar per reviewer, ordered by acceptance rate.
ggplot(
  ReviewerStats,
  aes(x = factor(ReviewerID, levels = ReviewerStats$ReviewerID), y = AcceptanceRate)
) +
  geom_col() +
  theme_bw() +
  labs(
    y = "Acceptance Rate %",
    x = "Reviewer ID",
    title = "Overall Acceptance Rate per Reviewer"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(limits = c(0, 30), breaks = seq(0, 30, 5))

  ggplot(ReviewerStats, aes(x=AcceptanceRate)) +
  geom_freqpoly(binwidth=4) +
  theme_bw() +
  ylab("Number of Reviewers") +
  xlab("Acceptance Rate %") +
  ggtitle("Overall Acceptance Rate per Reviewer") +
  theme(axis.text.x = element_text(angle=45, hjust=1))

The award for biggest hater goes to lucky Reviewer 7 at 10.95% and the most accepting to Reviewer 4 at 27.40%

Fastest and slowest

# Sort by total completion time, longest first.
ReviewerStats <- ReviewerStats[order(-ReviewerStats$TotalTime), ]

# Reviewer 12 took breaks mid-session, so their wall-clock total is excluded.
ggplot(
  subset(ReviewerStats, ReviewerID != "Reviewer_12"),
  aes(x = factor(ReviewerID, levels = ReviewerStats$ReviewerID), y = TotalTime)
) +
  geom_col() +
  theme_bw() +
  labs(
    y = "Time for all reviews (min)",
    x = "Reviewer ID",
    title = "Total time for completion"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Order reviewers by mean time per abstract for the x-axis.
ReviewerStats <- ReviewerStats[order(-ReviewerStats$AverageTime), ]

# Long format: one row per (reviewer, timing metric).  Reviewer 12 excluded
# (breaks mid-session); non-timing columns dropped after the melt.
toplot <- melt(
  subset(ReviewerStats, ReviewerID != "Reviewer_12"),
  id.vars = c("ReviewerID"), variable.name = "Metric", value.name = "Time"
)
toplot <- subset(toplot, !(Metric %in% c("AcceptanceRate", "TotalTime")))

# Mean/median/min/max per-abstract time, one line per metric.
ggplot(
  toplot,
  aes(
    x = factor(ReviewerID, levels = ReviewerStats$ReviewerID),
    y = Time, group = Metric, color = Metric
  )
) +
  geom_line() +
  theme_bw() +
  labs(
    y = "Time (seconds)",
    x = "Reviewer ID",
    title = "Time spent per abstract"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Reviewer 7 took the most time and care in their reviews while 11 was the most rushed. But how does this compare to acceptance rate?

# Speed vs. leniency, labelled by reviewer (12 excluded as above).
ggplot(
  subset(ReviewerStats, ReviewerID != "Reviewer_12"),
  aes(x = AverageTime, y = AcceptanceRate, label = ReviewerID, color = ReviewerID)
) +
  geom_point() +
  geom_text(vjust = 1, hjust = 0) +
  theme_bw() +
  labs(
    x = "Average time per review (sec)",
    y = "Acceptance Rate (%)",
    title = "Relationship between Time and Acceptance"
  ) +
  theme(legend.position = "none") +
  scale_x_continuous(expand = c(0, 4))

# Pearson correlation between mean per-abstract time and acceptance rate.
# Across 12 reviewers: r = -0.22, p = 0.50 — no evidence of an association.
cor.test(ReviewerStats$AverageTime, ReviewerStats$AcceptanceRate, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  ReviewerStats$AverageTime and ReviewerStats$AcceptanceRate
## t = -0.70313, df = 10, p-value = 0.498
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7033390  0.4076294
## sample estimates:
##        cor 
## -0.2170483

There is at most a weak, non-significant negative association between time spent reviewing and acceptance rate (r = -0.22, p = 0.50), so no real connection can be claimed from these data.


Degree of consensus

# Master assignment table: each PMID with its two assigned reviewers.
MasterList <- suppressMessages(read_csv("ReviewAbstracts/AssignedReviewers.csv"))

# Fetch one reviewer's decision for one abstract from that reviewer's table.
lookup_decision <- function(pmid, rev) {
  subset(Reviews[[paste0("Reviewer_", rev)]], PMID == pmid)$Decision
}

# Attach both assigned reviewers' decisions to every abstract.
MasterList$ReviewerA_Decision <- mapply(lookup_decision, MasterList$PMID, MasterList$ReviewerA)
MasterList$ReviewerB_Decision <- mapply(lookup_decision, MasterList$PMID, MasterList$ReviewerB)

# Cross-tabulate the paired decisions ("Exclude Exclude", "Include Exclude",
# etc.).  Renamed from `c`, which shadowed base::c while c(...) was being
# called in the very next expression.
decision_pairs <- table(paste(MasterList$ReviewerA_Decision, MasterList$ReviewerB_Decision))

# Collapse the four cells into consensus vs. conflict counts.
consensus <-
  data.frame(Decision = c("Consensus on Exclude", "Consensus on Include", "Conflicting Opinion"),
             Frequency = c(decision_pairs["Exclude Exclude"],
                           decision_pairs["Include Include"],
                           sum(decision_pairs["Exclude Include"], decision_pairs["Include Exclude"]))
  )
rownames(consensus) <- NULL
kable(consensus)
Decision Frequency
Consensus on Exclude 317
Consensus on Include 45
Conflicting Opinion 64
# Whole-number percentages, drawn as a pie chart (stacked bar in polar coords).
consensus$Percentage <- round(100 * consensus$Frequency / sum(consensus$Frequency))
ggplot(consensus, aes(x = "", y = Percentage, fill = Decision)) +
  geom_col() +
  coord_polar("y", start = 0) +
  theme_minimal()

# Consensus call per abstract.  Vectorized replacement for the original
# row-wise apply(), which coerced the whole data frame to a character
# matrix and ran one closure per row; the output is identical.
both_include <- MasterList$ReviewerA_Decision == "Include" &
  MasterList$ReviewerB_Decision == "Include"
both_exclude <- MasterList$ReviewerA_Decision == "Exclude" &
  MasterList$ReviewerB_Decision == "Exclude"
MasterList$FinalOpinion <- ifelse(
  both_include, "Consensus_Include",
  ifelse(both_exclude, "Consensus_Exclude", "Conflicting")
)

Extent of disagreement by person

#dis<-as.data.frame(t(combn(unique(MasterList$ReviewerA), 2)))
# Every ordered (Reviewer1, Reviewer2) pairing from the assignment columns.
# NOTE(review): expand.grid yields both (a,b) and (b,a) when the reviewer
# sets overlap, and the subset() conditions below are symmetric — so each
# shared abstract is counted once from each reviewer's side.  That is what
# the per-Reviewer1 roll-up below needs, but do not reuse `dismets` as a
# table of unique pairs without de-duplicating first.
dis<-expand.grid(unique(MasterList$ReviewerA), unique(MasterList$ReviewerB))
colnames(dis)<-c("Reviewer1", "Reviewer2")
# Number of abstracts the two reviewers both screened (either order).
dis$N.overlap<-apply(dis, 1, function(x) {
  nrow( subset(MasterList, (ReviewerA==x[1] & ReviewerB==x[2]) |  (ReviewerA==x[2] & ReviewerB==x[1]) ) )
  })

# Of those shared abstracts, how many ended with conflicting decisions.
dis$N.conflict<-apply(dis, 1, function(x) {
  nrow( subset(MasterList, FinalOpinion=="Conflicting" & ((ReviewerA==x[1] & ReviewerB==x[2]) |  (ReviewerA==x[2] & ReviewerB==x[1])) ) )
  })

# Keep the pairwise table, then roll up to one row per reviewer.
dismets<-dis
dis<-ddply(dis, "Reviewer1", summarize, N.conflict=sum(N.conflict) , N.overlap=sum(N.overlap))
dis$Reviewer1<-paste0("Reviewer_", dis$Reviewer1)
dis$Percent.conflict<-round(100*(dis$N.conflict/dis$N.overlap),2)
# Rank 1 = lowest conflict rate, i.e. the most agreed-with reviewer.
dis<-dis[order(dis$Percent.conflict),]
dis$Rank<-1:nrow(dis)

kable(dis)
Reviewer1 N.conflict N.overlap Percent.conflict Rank
7 Reviewer_7 7 73 9.59 1
3 Reviewer_3 7 71 9.86 2
12 Reviewer_12 8 72 11.11 3
5 Reviewer_5 9 71 12.68 4
6 Reviewer_6 10 72 13.89 5
1 Reviewer_1 10 71 14.08 6
10 Reviewer_10 12 72 16.67 7
2 Reviewer_2 12 71 16.90 8
11 Reviewer_11 12 69 17.39 9
8 Reviewer_8 12 68 17.65 10
9 Reviewer_9 14 69 20.29 11
4 Reviewer_4 15 73 20.55 12

Reviewer 7 was the most agreed-with reviewer, while Reviewer 4 had the most disagreement.


Finalized Decisions

# Interactive table of every abstract with both reviewers' decisions and
# the final consensus call.
Nice.Table(MasterList)
# NOTE(review): "Concensus" is a misspelling of "Consensus" — left as-is
# because downstream steps may read this exact filename; confirm before
# renaming the output file.
write_csv(MasterList, "Concensus_table.csv")